sorting list (for the extra bonus of the homework)



In [24]:

    
x = ["aardvark", "bee", 'croco', 'duck', "emo"]



In [25]:

    
#sorted by second letter ['aardvark', 'bee', 'emu', 'crocodrile; 'duck' ]
sorted(x, reverse=True)









    Out[25]:





['emo', 'duck', 'croco', 'bee', 'aardvark']



In [26]:

    
#sorted(x, key=???) when you want to sort by the second letter of the list.



In [27]:

    
def get_second_letter(s):
    return s[1]



In [28]:

    
get_second_letter("cheese")









    Out[28]:





'h'



In [29]:

    
sorted(x, key=get_second_letter)









    Out[29]:





['aardvark', 'bee', 'emo', 'croco', 'duck']

Lambda functions!

a way of writing a function on a single line



In [30]:

    
#normal function
def get_second_letter(s):
    return s[1]



In [31]:

    
get_second_letter = lambda s: s[1]



In [32]:

    
get_second_letter("hello")









    Out[32]:





'e'



In [35]:

    
sorted(x, key = lambda s: s[1])









    Out[35]:





['aardvark', 'bee', 'emo', 'croco', 'duck']



In [19]:

    
# [P['name'] for p in sorted(planets, hey=lambda x: x['moons'])]



In [20]:

    
# def get moon_count(d):
#     return d['moons']
# sorted(planets, key=get_moon_count)



In [21]:

    
#written in  SQL:
#Select name from planet order by moons

tuple



In [39]:

    
t = [5]



In [40]:

    
for item in t:
    print(item * item)



In [41]:

    
t.append(30)



In [42]:

    
carefree_list = [5, 33, 32, 66, 44]



In [43]:

    
carefree_list[1] = 'Mr Fluffypants'



In [45]:

    
carefree_list









    Out[45]:





[5, 'Mr Fluffypants', 32, 66, 44]



In [48]:

    
t[1] = 'Mr. Fluffypants'



In [49]:

    
t









    Out[49]:





[5, 'Mr. Fluffypants']



In [50]:

    
#inmutable data type
#one benefit is exactly that: cant be changed
#other benefit is that tuples are memory-effcient



In [51]:

    
import sys



In [52]:

    
hello = [1, 2, 3]



In [54]:

    
sys.getsizeof(hello)









    Out[54]:





88

Back to regular expressions for a moment

Grouping with multiple matches in th same string



In [59]:

    
test = "one 1 two 2 three 3 four 4 five 5"



In [63]:

    
import re
re.findall(r"\w+ \d", test)









    Out[63]:





['one 1', 'two 2', 'three 3', 'four 4', 'five 5']



In [64]:

    
for item in re.findall(r"(\w+) (\d)", test):









    



  File "<ipython-input-64-b4449377acf8>", line 1
    for item in re.findall(r"(\w+) (\d)", test):
                                                ^
SyntaxError: unexpected EOF while parsing



In [65]:

    
all_subjects = open("enronsubjects.txt").read()



In [68]:

    
[item[0] for item in re.findall (r"(\d{3})-(\d{3})-(\d{4})", all_subjects)]









    Out[68]:





['713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '281',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '281',
 '713',
 '713',
 '713',
 '614',
 '713',
 '303',
 '281',
 '800',
 '800',
 '888']

Monetary amounts in the subjects lines

match something like $10 m,k,b



In [70]:

    
re.findall(r"\$(\d+) ?(\w+)", all_subjects)









    Out[70]:





[('22', '8'),
 ('22', '8'),
 ('10', 'M'),
 ('10', 'M'),
 ('10', 'M'),
 ('10', 'M'),
 ('6', '8'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('82', '0'),
 ('82', '0'),
 ('40', 'Million'),
 ('27', 'Billion'),
 ('27', 'Billion'),
 ('5', '0'),
 ('5', '0'),
 ('89', '5'),
 ('89', '5'),
 ('1', '9'),
 ('1', '9'),
 ('1', '9'),
 ('1', '9'),
 ('870', 'K'),
 ('870', 'K'),
 ('14', '1'),
 ('14', '1'),
 ('21', 'billion'),
 ('6', 'million'),
 ('14', 'bln'),
 ('14', 'bln'),
 ('100', 'PRICE'),
 ('250', 'Cap'),
 ('350', 'MM'),
 ('1', '2'),
 ('1', '2'),
 ('1', '2'),
 ('1', '2'),
 ('10', 'Three'),
 ('70', '0'),
 ('70', '0'),
 ('70', '0'),
 ('10', 'you'),
 ('10', 'you'),
 ('13', 'B'),
 ('13', 'B'),
 ('100', 'on'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('1', 'Billion'),
 ('1', 'Billion'),
 ('39', 'in'),
 ('39', 'in'),
 ('1', '0'),
 ('1', '0'),
 ('14', '9'),
 ('5', '0'),
 ('5', '0'),
 ('5', '0'),
 ('2', '1'),
 ('21', 'P'),
 ('550', 'Million'),
 ('455', 'Million'),
 ('5', 'million'),
 ('5', 'million'),
 ('5', 'million'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('100', 'Price'),
 ('2', '0'),
 ('2', '0'),
 ('2', '0'),
 ('2', '0'),
 ('10', '0'),
 ('10', '0'),
 ('10', '0'),
 ('2', '0'),
 ('2', '0'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('6', '7'),
 ('100', 'mil'),
 ('50', 'per'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('4', '2'),
 ('4', '2'),
 ('4', '2'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('1', '0'),
 ('1', '0'),
 ('1', '6'),
 ('1', '6'),
 ('1', '6'),
 ('8', 'Million'),
 ('8', 'Million'),
 ('500', 'mm'),
 ('500', 'mm'),
 ('500', 'mm'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('50', 'M'),
 ('102', 'Target'),
 ('102', 'Target'),
 ('20', '0'),
 ('5', '0'),
 ('25', 'Million'),
 ('25', 'Million'),
 ('25', 'Million'),
 ('120', 'EXTRA'),
 ('120', 'EXTRA'),
 ('45', 'Million'),
 ('45', 'Million'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('600', 'B'),
 ('600', 'B'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('24', '0'),
 ('24', '0'),
 ('2', '2'),
 ('2', '2'),
 ('2', '2'),
 ('100', 'k'),
 ('7', '7'),
 ('18', '3'),
 ('130', 'Million'),
 ('130', 'Million'),
 ('130', 'Million'),
 ('1', 'mm'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('128', 'Return'),
 ('128', 'Return')]



In [73]:

    
vals= []
for item in re.findall(r"\$(\d+) ?([mMbBkK])", all_subjects):
    multiplier = item[1].lower()
    number_val = int(item[0])
    if multiplier == 'K':
        number_val *= 1000
    elif multiplier == 'm':
        number_val *= 1000000
    elif multiplier == 'b':
        number_val *= 100000000
    vals.append(number_val)
sum(vals)









    Out[73]:





139151006340

substitution with regular expressions



In [74]:

    
message = "this is a test, this is only a test"



In [76]:

    
message.replace("this", "that").replace("text", "walrus")









    Out[76]:





'that is a test, that is only a test'



In [78]:

    
message = "This is a test, this is only a test"
re.sub(r"[Tt]his", "that", message)









    Out[78]:





'that is a test, that is only a test'



In [80]:

    
re.sub(r"\b\w+\b", "PIKACHU", message)









    Out[80]:





'PIKACHU PIKACHU PIKACHU PIKACHU, PIKACHU PIKACHU PIKACHU PIKACHU PIKACHU'



In [85]:

    
anon = re.sub(r"(\d{3})-(\d{3})-(\d{4})", r"\1-\2-XXXX", all_subjects)



In [89]:

    
re.findall(r"\d{3}-\d{3}-X{4}.{,20}", anon)









    Out[89]:





['713-853-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '281-296-XXXX',
 '713-851-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '281-367-XXXX or',
 '713-528-XXXX',
 '713-850-XXXXw/713-703-XXXXc and ',
 '614-888-XXXX',
 '713-767-XXXX re Debbie Chance',
 '303-571-XXXX',
 '281-537-XXXX (home)',
 '800-937-XXXX,',
 '800-937-XXXX and ask for the Jul',
 '888-296-XXXX, HC:']

HTML to SQL

scrapping websites



In [111]:

    
from urllib.request import urlretrieve
urlretrieve("https://raw.githubusercontent.com/ledeprogram/data-and-databases/master/menupages-morningside-heights.html", "menu.html")









    Out[111]:





('menu.html', <http.client.HTTPMessage at 0x109d3f400>)



In [99]:

    
#store:
#     *restaurant name
#     *price ($$$$$)
#     *cuisines

# every restauatrant has a `<tr>` that is a child of a the `<table>` tag with class `search-results`
# restaurant are in <td> tag with class= `name-address`
# restaurant names are un <a> tag inside that <td>
# restaurant price in a `span` insude an `<td> with a class `price`
# the cuisine of the restaurant is in a `<td> tag iwth no class, the fifth `<td> tag that is a child of a the restaurants `<tr>`

#target:
 
*list of diccionaties

[ 
  {'name: "Brads", price: 1, Cuisines: [coffee]},
  {}'name': "Cafe Nana", 'price'
    ]



In [100]:

    
# syntax: urlretreive(url, filename)



In [113]:

    
from bs4 import BeautifulSoup



In [120]:

    
raw_html = open("menu.html").read()
soup = BeautifulSoup(raw_html, "html.parser")



In [121]:

    
#Just the names



In [122]:

    
search_table = soup.find("table", {'class': 'search-results'})
table_body = search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
    name_adress_tag = tr_tag.find('td', {'class': 'name-adress'})
    a_tag = name_adress_tag.find('a')
    print(tr_tag)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-122-957540aeafed> in <module>()
      3 for tr_tag in table_body.find_all('tr'):
      4     name_adress_tag = tr_tag.find('td', {'class': 'name-adress'})
----> 5     a_tag = name_adress_tag.find('a')
      6     print(tr_tag)

AttributeError: 'NoneType' object has no attribute 'find'



In [ ]:

    
#how about names and prices? and maybe the cuisine too



In [ ]:



In [123]:

    
search_table = sopu.find("table", {'class': 'search-results'})
table_body = search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
    # get restaurant name from the inside a td
    # restaurant name = get_name(tr_tag)
    name_adress_tag = tre_tag.find('td', {'class': 'name-adress'})
    a_tag = name_adress_tag.find('a')
    restaurant_name = a_tag.string
    # get the price from the span if present
    price_tag = tr.tag.find('td', {'class': 'price'})
    price_span_tag = price_tag.fid('span')
    if price_pan_tag:
    price = price_span_tag.string
    else:
        price = 0
    print(restaurant_name, price)









    



  File "<ipython-input-123-0a472f49bd4f>", line 13
    price = price_span_tag.string
        ^
IndentationError: expected an indented block



In [ ]:

    
# much organized code using functions



In [124]:

    
def get_name(tr_tag):
    name_adress_tag = tre_tag.find('td', {'class': 'name-adress'})
    a_tag = name_adress_tag.find('a')
    restaurant_name = a_tag.string
    return restaurant_name
def get_price(tr_tag):
    price_tag = tr.tag.find('td', {'class': 'price'})
    price_span_tag = price_tag.fid('span')
    if price_pan_tag:
    price = price_span_tag.string
    else:
        price = 0
    return price 
def get_cuisines(tr_tag):
    all_td_tag = tr_tag.find_all('td')
    cuisine_tag = all_td_tags[4]
    cuisines = int(cuisining_tag.string)
    if cuisines:
        cuisines_list = cuisines.split(", ")
    else:
        cuisines_list = []
    return cuisines_list









    



  File "<ipython-input-124-fc415126fe80>", line 10
    price = price_span_tag.string
        ^
IndentationError: expected an indented block



In [125]:

    
restaurants = []
search_table = sopu.find("table", {'class': 'search-results'})
table_body = search_table.find('tbody')
for tr_tag in table_body.find_all('tr'): 
    restaurant_name = get_name(tr_tag)
    price = get_price(tr_tag)
    cuisines = get_cuisines(tr_tag)
    rest_dict = {'name': restaurant_name, 'price': price, 'cuisines': cuisines }
    restaurants.append(rest_dict)
restaurants









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-125-de43f4e64670> in <module>()
      1 restaurants = []
----> 2 search_table = sopu.find("table", {'class': 'search-results'})
      3 table_body = search_table.find('tbody')
      4 for tr_tag in table_body.find_all('tr'):
      5     restaurant_name = get_name(tr_tag)

NameError: name 'sopu' is not defined



In [126]:

    
# we want a list of str with get_cousines as a function



In [127]:

    
import









    



  File "<ipython-input-127-451c6f6f942e>", line 1
    import
          ^
SyntaxError: invalid syntax



In [ ]: